import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np
# Load the Pima Indians diabetes dataset from a local CSV file.
csv_path = r"C:\Users\Pratiksha Bargal\Downloads\diabetes.csv"
df = pd.read_csv(csv_path)
# Peek at the first five rows to sanity-check the load.
df.head()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
df.tail()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 DiabetesPedigreeFunction 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB
df.isnull().sum()
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
# Physiologically impossible zeros in these measurements are missing-value
# placeholders; convert them to NaN in one vectorized pass so they can be
# imputed later. (Pregnancies legitimately can be 0 and is left alone.)
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
df.head()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148.0 | 72.0 | 35.0 | NaN | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85.0 | 66.0 | 29.0 | NaN | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183.0 | 64.0 | NaN | NaN | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89.0 | 66.0 | 23.0 | 94.0 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137.0 | 40.0 | 35.0 | 168.0 | 43.1 | 2.288 | 33 | 1 |
df.describe()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 763.000000 | 733.000000 | 541.000000 | 394.000000 | 757.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 121.686763 | 72.405184 | 29.153420 | 155.548223 | 32.457464 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 30.535641 | 12.382158 | 10.476982 | 118.775855 | 6.924988 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 44.000000 | 24.000000 | 7.000000 | 14.000000 | 18.200000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 64.000000 | 22.000000 | 76.250000 | 27.500000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 29.000000 | 125.000000 | 32.300000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 141.000000 | 80.000000 | 36.000000 | 190.000000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
from sklearn.impute import KNNImputer

# Fill the NaNs (former zero placeholders) with KNN imputation. The Outcome
# target is excluded from the imputation so the label does not leak into the
# feature values, and its integer dtype is preserved.
# NOTE(review): fitting the imputer on the full dataset still leaks test-set
# statistics into training; ideally fit on the training split only.
feature_cols = df.columns.drop("Outcome")
imputer = KNNImputer(n_neighbors=2)
df[feature_cols] = imputer.fit_transform(df[feature_cols])
# Class balance: share of diabetic vs. non-diabetic samples.
outcome_counts = df['Outcome'].value_counts()
plt.figure(figsize=(5, 5))
plt.pie(
    outcome_counts,
    labels=['Non-diabetic', 'Diabetic'],
    radius=1,
    autopct='%1.1f%%',
    labeldistance=1.15,
)
plt.legend(title='Outcome:', loc='upper right', bbox_to_anchor=(1.6, 1))
plt.show()
# Column distributions prior to outlier treatment.
plt.figure(figsize=(14, 8))
sns.boxplot(data=df)
plt.title("the columns before handling outliers")
plt.show()
# Winsorize every column to the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Clipping (rather than dropping rows) keeps the sample size intact.
# Vectorized over all columns at once instead of a per-column Python loop;
# the binary Outcome column is unaffected (its fences span [-1.5, 2.5]).
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1
df = df.clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr, axis=1)
# Re-check the distributions now that extreme values have been clipped.
plt.figure(figsize=(14, 6))
ax = sns.boxplot(data=df)
ax.set_title("the columns after handling outliers")
plt.show()
# Per-column histograms to inspect the shape of each distribution.
df.hist(bins=50, figsize=(20, 15))
# Correlation heat map across all numeric columns.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, linewidths=0.5, fmt='.1f', ax=ax)
# Relationship between glucose level and the binary outcome.
sns.scatterplot(data=df, x='Glucose', y='Outcome')
plt.title('Scatter Plot between Glucose and Outcome')
plt.xlabel('Glucose')
plt.ylabel('Outcome')
plt.show()
# Glucose spread within each outcome class.
sns.boxplot(data=df, x='Outcome', y='Glucose')
plt.title('Boxplot of Glucose by Outcome')
plt.xlabel('Outcome')
plt.ylabel('Glucose')
plt.show()
# Glucose distribution split by diabetes outcome.
glucose_hist = px.histogram(
    data_frame=df,
    x='Glucose',
    color='Outcome',
    title="Glucose Distribution",
    height=500,
    color_discrete_map={0: 'black', 1: 'orange'},
)
# Center the title (magic-underscore form of {'title': {'x': 0.5}}).
glucose_hist.update_layout(title_x=0.5)
glucose_hist.show()
# Insulin distribution split by diabetes outcome.
# (Dropped the redundant `import plotly.express as px` — it is already
# imported at the top of the file.)
insulin_hist = px.histogram(data_frame=df, x="Insulin", color='Outcome', title="Insulin Distribution", height=500, color_discrete_map={0: 'black', 1: 'orange'})
insulin_hist.update_layout({'title': {'x': 0.5}})
insulin_hist.show()
# BMI distribution split by diabetes outcome.
# (Dropped the redundant `import plotly.express as px` — it is already
# imported at the top of the file.)
bmi_hist = px.histogram(data_frame=df, x="BMI", color='Outcome', title="BMI Distribution", height=500, color_discrete_map={0: 'black', 1: 'orange'})
bmi_hist.update_layout({'title': {'x': 0.5}})
bmi_hist.show()
from sklearn.model_selection import train_test_split

# Separate the predictors from the binary target.
X = df.drop(columns="Outcome")
y = df["Outcome"]
# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Candidate classifiers, seeded where randomness applies.
models = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    SVC(kernel='linear', C=1),
    KNeighborsClassifier(n_neighbors=3),
    LogisticRegression(random_state=42, max_iter=1000),  # raised max_iter to avoid convergence warning
]

# Train and evaluate each model. Accumulate one dict per model and build the
# results DataFrame once at the end — growing a DataFrame row-by-row with
# `results.loc[len(results.index)]` copies the frame on every append.
rows = []
for model in models:
    name = model.__class__.__name__
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    rows.append({"Model": name, "Train Accuracy": train_accuracy, "Test Accuracy": test_accuracy})
    print("{} Accuracy: {:.2f}%".format(name, test_accuracy * 100))
    print("{} Classification Report:\n{}".format(name, classification_report(y_test, y_test_pred)))
    print("\n" + "="*50 + "\n")

results = pd.DataFrame(rows)
# Print the results DataFrame
print(results)
RandomForestClassifier Accuracy: 77.27%
RandomForestClassifier Classification Report:
precision recall f1-score support
0.0 0.83 0.84 0.84 107
1.0 0.63 0.62 0.62 47
accuracy 0.77 154
macro avg 0.73 0.73 0.73 154
weighted avg 0.77 0.77 0.77 154
==================================================
SVC Accuracy: 80.52%
SVC Classification Report:
precision recall f1-score support
0.0 0.82 0.92 0.87 107
1.0 0.74 0.55 0.63 47
accuracy 0.81 154
macro avg 0.78 0.73 0.75 154
weighted avg 0.80 0.81 0.80 154
==================================================
KNeighborsClassifier Accuracy: 69.48%
KNeighborsClassifier Classification Report:
precision recall f1-score support
0.0 0.77 0.80 0.79 107
1.0 0.50 0.45 0.47 47
accuracy 0.69 154
macro avg 0.63 0.63 0.63 154
weighted avg 0.69 0.69 0.69 154
==================================================
LogisticRegression Accuracy: 79.87%
LogisticRegression Classification Report:
precision recall f1-score support
0.0 0.83 0.90 0.86 107
1.0 0.71 0.57 0.64 47
accuracy 0.80 154
macro avg 0.77 0.74 0.75 154
weighted avg 0.79 0.80 0.79 154
==================================================
Model Train Accuracy Test Accuracy
0 RandomForestClassifier 1.000000 0.772727
1 SVC 0.762215 0.805195
2 KNeighborsClassifier 0.838762 0.694805
3 LogisticRegression 0.768730 0.798701